import os
NOVA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps'
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/Opera18Days_Reimaged/logs/")
PLOT_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/Opera18Days_Reimaged/logs/plots")
os.chdir(NOVA_HOME)
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript
from tools.preprocessing_tools.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
run_calc_hist_new
from tools.preprocessing_tools.qc_config import opera18days_panels, opera18days_markers, opera18days_marker_info, \
opera18days_cell_lines, opera18days_cell_lines_to_cond,\
opera18days_cell_lines_for_disp, opera18days_reps, \
opera18days_line_colors, opera18days_lines_order, \
opera18days_custom_palette, opera18days_expected_dapi_raw, \
markers
%load_ext autoreload
%autoreload 2
df = log_files_qc(LOGS_PATH,only_wt_cond=False, filename_split='-',site_location=0)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch2 reading logs of batch1 Total of 2 files were read. Before dup handeling (120169, 22) After duplication removal #1: (120169, 23) After duplication removal #2: (120169, 23)
# choose batches
batches = ['batch1', 'batch2']
batches
['batch1', 'batch2']
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'Opera18DaysReimaged_sorted')
batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, opera18days_panels, opera18days_markers.copy(),PLOT_PATH, opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches_raw, fig_height=12)
batch1 Folder structure is valid. No bad files are found. Total Sites: 82000
======== batch2 Folder structure is valid. No bad files are found. Total Sites: 82000
======== ====================
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'Opera18DaysReimaged')
procs = run_validate_folder_structure(root_directory_proc, True, opera18days_panels, opera18days_markers,PLOT_PATH,opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches, fig_height=12)
batch1 Folder structure is valid. No bad files are found. Total Sites: 53458
======== batch2 Folder structure is valid. No bad files are found. Total Sites: 51021
======== ====================
display_diff(batches, raws, procs, PLOT_PATH,fig_height=12)
batch1
======== batch2
========
#for batch in list(range(3,9)) + ['7_16bit','8_16bit','9_16bit']:
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=2, rep_count=len(opera18days_reps),
num_markers=len(opera18days_markers))
print(f'{batch} var: ',var)
batch1 var: 0.023519648146724494 batch2 var: 0.023363238729173504
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, opera18days_line_colors, opera18days_panels,
figsize=(10,6), reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner,
opera18days_line_colors, opera18days_panels, reps = opera18days_reps,
figsize=(10,6))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose,
opera18days_line_colors, opera18days_panels, figsize=(10,6),
reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
figsize=(10,10), markers=opera18days_markers)
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, opera18days_markers)
## Are we using FMRP?
markers_for_d18 = markers.copy()
markers_for_d18.remove('DAPI')
markers_for_d18.remove('TIA1')
total_sum[total_sum.marker.isin(markers_for_d18)].n_valid_tiles.sum()
144424
total_sum[total_sum.marker == 'DAPI'].n_valid_tiles.sum()
74116
total_sum[total_sum.marker.isin(markers_for_d18)].site_cell_count_sum.sum()
894423.0
total_sum[total_sum.marker == 'DAPI'].site_cell_count_sum.sum()
526900.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 820.000000 | 820.000000 | 820.000000 | 820.000000 |
| mean | 166.217073 | 1.662171 | 189.142683 | 594.303659 |
| std | 107.340473 | 1.073405 | 197.388148 | 470.461480 |
| min | 15.000000 | 0.150000 | 15.000000 | 35.000000 |
| 25% | 98.750000 | 0.987500 | 100.750000 | 341.750000 |
| 50% | 154.500000 | 1.545000 | 159.000000 | 539.500000 |
| 75% | 210.500000 | 2.105000 | 221.000000 | 797.000000 |
| max | 1059.000000 | 10.590000 | 2375.000000 | 5331.000000 |
| sum | 136298.000000 | NaN | 155097.000000 | 487329.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch2 | ||||
| count | 820.000000 | 820.000000 | 820.000000 | 820.000000 |
| mean | 142.000000 | 1.420000 | 148.734146 | 508.045122 |
| std | 79.567789 | 0.795678 | 96.995802 | 323.661039 |
| min | 8.000000 | 0.080000 | 9.000000 | 8.000000 |
| 25% | 83.000000 | 0.830000 | 84.000000 | 274.000000 |
| 50% | 127.000000 | 1.270000 | 128.500000 | 460.000000 |
| 75% | 193.000000 | 1.930000 | 197.000000 | 717.250000 |
| max | 658.000000 | 6.580000 | 952.000000 | 2801.000000 |
| sum | 116440.000000 | NaN | 121962.000000 | 416597.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 1640.000000 | 1640.000000 | 1640.000000 | 1640.000000 |
| mean | 154.108537 | 1.541085 | 168.938415 | 551.174390 |
| std | 95.224812 | 0.952248 | 156.776397 | 405.964317 |
| min | 8.000000 | 0.080000 | 9.000000 | 8.000000 |
| 25% | 90.000000 | 0.900000 | 92.000000 | 292.000000 |
| 50% | 140.000000 | 1.400000 | 143.000000 | 508.000000 |
| 75% | 203.000000 | 2.030000 | 209.250000 | 760.000000 |
| max | 1059.000000 | 10.590000 | 2375.000000 | 5331.000000 |
| sum | 252738.000000 | NaN | 277059.000000 | 903926.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
For each batch, cell line, replicate and markerTotal number of tiles
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond FUSHeterozygous Untreated 1.802519 FUSHomozygous Untreated 1.866768 FUSHomozygous stress 1.528936 FUSRevertant Untreated 1.170026 OPTN Untreated 1.621651 SNCA Untreated 1.443494 TBK1 Untreated 1.256183 TDP43 Untreated 1.368266 WT Untreated 2.208756 WT stress 1.687768 Name: n_valid_tiles, dtype: float64
plot_catplot(df_dapi, opera18days_custom_palette, opera18days_reps,
x='n_valid_tiles', x_title='valid tiles count', batch_min=1, batch_max=2)
plot_hm(df_dapi, split_by='rep', rows='cell_line', columns='panel')
for batch in batches:
print(batch)
run_calc_hist_new(f'{batch}',opera18days_cell_lines_for_disp, opera18days_markers,
root_directory_raw, root_directory_proc, hist_sample=10,
sample_size_per_markers=200, ncols=7, nrows=5)
print("="*30)
batch1
============================== batch2
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system('jupyter nbconvert --to html src/preprocessing/notebooks/qc_report_Opera18DaysReimaged.ipynb')